Import Modules¶
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
Read data¶
# Location of the TidyTuesday (2020-01-21) Spotify songs CSV.
songs_url = 'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-01-21/spotify_songs.csv'

# `df` stays as loaded; the cells below add columns to and recast `spotify_df`.
df = pd.read_csv(filepath_or_buffer=songs_url)
spotify_df = df.copy(deep=True)
1. Introduction¶
2. EDA¶
a) Basic Information:¶
1 - How many rows and columns?
# (number of rows, number of columns) of the working copy.
spotify_df.shape
(32833, 23)
The spotify_df dataset has 32833 rows and 23 columns.
2 - What are the variable names?
3 - What are the data types of the variables?
# The index of this Series lists the variable names; the values are their dtypes.
spotify_df.dtypes
track_id object track_name object track_artist object track_popularity int64 track_album_id object track_album_name object track_album_release_date object playlist_name object playlist_id object playlist_genre object playlist_subgenre object danceability float64 energy float64 key int64 loudness float64 mode int64 speechiness float64 acousticness float64 instrumentalness float64 liveness float64 valence float64 tempo float64 duration_ms int64 dtype: object
There are 13 numeric variables (9 float, 4 integers) and 10 object variables.
4 - Number of missing values per variable
# Count missing values per column once, then keep only the columns that have any.
spotify_df.isna().sum().loc[lambda counts: counts > 0]
track_name 5 track_artist 5 track_album_name 5 dtype: int64
Three variables (track_name, track_artist, and track_album_name) have missing data, with 5 missing values each.
5 - The number of unique values per column
# Number of distinct values in each column.
spotify_df.nunique()
track_id 28356 track_name 23449 track_artist 10692 track_popularity 101 track_album_id 22545 track_album_name 19743 track_album_release_date 4530 playlist_name 449 playlist_id 471 playlist_genre 6 playlist_subgenre 24 danceability 822 energy 952 key 12 loudness 10222 mode 2 speechiness 1270 acousticness 3731 instrumentalness 4729 liveness 1624 valence 1362 tempo 17684 duration_ms 19785 dtype: int64
Since there are 32,833 total records but only 28,356 unique track_ids, some tracks must appear in more than one row, most likely because the same track is included in several playlists.
THINGS TO CONSIDER
playlist_id and track_album_id can cause the same track/song to show up many times (duplicated). I think I only want one row per track, labelled with its most common playlist_genre and playlist_subgenre.
def cesar(groups, data=None):
    """Summarise how much the track-level fields vary within each group.

    Rows are grouped by ``groups``. For every group this computes the number
    of distinct values of ``track_popularity`` and of each audio feature, plus
    the number of non-null ``playlist_genre`` / ``playlist_subgenre`` entries.
    The per-group table is then collapsed with ``nunique()``, so each entry of
    the result is the number of *different* per-group summaries observed for
    that column. A value of 1 means every group produced the same summary.

    Parameters
    ----------
    groups : list of str
        Column names to group by.
    data : pandas.DataFrame, optional
        Frame to summarise. When omitted, the notebook-level ``spotify_df``
        is used, which is what the original one-argument call did.

    Returns
    -------
    pandas.Series
        Indexed by the grouping columns followed by the ``num_*_values``
        summary columns, in the order listed below.
    """
    frame = spotify_df if data is None else data

    # (label used in the output column name, source column), in output order.
    distinct_value_columns = [
        ("track_pop", "track_popularity"),
        ("valence", "valence"),
        ("danceability", "danceability"),
        ("energy", "energy"),
        ("key", "key"),
        ("loudness", "loudness"),
        ("mode", "mode"),
        ("speechiness", "speechiness"),
        ("acousticness", "acousticness"),
        ("instrumentalness", "instrumentalness"),
        ("liveness", "liveness"),
        ("tempo", "tempo"),
        ("duration_ms", "duration_ms"),
    ]
    # These two are counted (rows per group), not de-duplicated.
    row_count_columns = ["playlist_genre", "playlist_subgenre"]

    spec = {
        f"num_{label}_values": (column, "nunique")
        for label, column in distinct_value_columns
    }
    spec.update(
        {f"num_{column}_values": (column, "count") for column in row_count_columns}
    )

    return frame.groupby(groups).aggregate(**spec).reset_index().nunique()
# One group per (track, subgenre) pairing.
groupby = ["track_id", "playlist_subgenre"]
cesar(groups=groupby)
track_id 28356 playlist_subgenre 24 num_track_pop_values 1 num_valence_values 1 num_danceability_values 1 num_energy_values 1 num_key_values 1 num_loudness_values 1 num_mode_values 1 num_speechiness_values 1 num_acousticness_values 1 num_instrumentalness_values 1 num_liveness_values 1 num_tempo_values 1 num_duration_ms_values 1 num_playlist_genre_values 1 num_playlist_subgenre_values 1 dtype: int64
# One group per track, regardless of playlist or subgenre.
groupby = ["track_id"]
cesar(groups=groupby)
track_id 28356 num_track_pop_values 1 num_valence_values 1 num_danceability_values 1 num_energy_values 1 num_key_values 1 num_loudness_values 1 num_mode_values 1 num_speechiness_values 1 num_acousticness_values 1 num_instrumentalness_values 1 num_liveness_values 1 num_tempo_values 1 num_duration_ms_values 1 num_playlist_genre_values 10 num_playlist_subgenre_values 10 dtype: int64
# How many rows share the same track / album / playlist / genre / subgenre
# combination, tabulated by frequency.
(
    spotify_df
    .groupby(["track_id", "track_album_id", "playlist_id", "playlist_genre", "playlist_subgenre"])
    .size()
    .reset_index(name="num_rows")
    ["num_rows"]
    .value_counts()
)
num_rows 1 32833 Name: count, dtype: int64
So the track_id and playlist_subgenre pairings are UNIQUE! This means that a song could have multiple rows in a playlist if multiple subgenres are specified.
So one song could be in multiple playlists AND, within each playlist, have a row for each UNIQUE subgenre!
These are the variables of interest.
# "mostcom_playlistgenre" is only a placeholder name here; the column itself is
# built later and merged in. In notebook order, key and mode are still integer
# columns at this point, so select_dtypes("number") picks them up.
numeric_columns = spotify_df.select_dtypes("number").columns.to_list()
vars_of_interest = ["track_id", "mostcom_playlistgenre", *numeric_columns]
I am going to convert the variables key and mode to categorical variables.
# key and mode are codes, not quantities, so treat them as categories.
for column_name in ("key", "mode"):
    spotify_df[column_name] = spotify_df[column_name].astype("category")

spotify_df.dtypes
track_id object track_name object track_artist object track_popularity int64 track_album_id object track_album_name object track_album_release_date object playlist_name object playlist_id object playlist_genre object playlist_subgenre object danceability float64 energy float64 key category loudness float64 mode category speechiness float64 acousticness float64 instrumentalness float64 liveness float64 valence float64 tempo float64 duration_ms int64 dtype: object
DECISION:¶
OUTCOME:
- We are going to convert the continuous `track_popularity` to a binary outcome because this variable is bounded and a linear regression would not be appropriate.
- We are also going to create a new, transformed `track_popularity` variable using the hack (nudging the boundary values 0 and 100 inward by 0.1) and a logit transformation.
popularity = spotify_df["track_popularity"]

# Binary outcomes: 1 when popularity is strictly above the cutoff, else 0.
for outcome_name, cutoff in (
    ("binary_outcome", 50),
    ("binary_outcome_60", 60),
    ("binary_outcome_70", 70),
):
    spotify_df[outcome_name] = np.where(popularity > cutoff, 1, 0)

# Nudge the boundary values (100 -> 99.9, 0 -> 0.1) so that the logit applied
# later is finite; every other value is left as is.
spotify_df["track_pop_shift"] = np.where(
    popularity == 100,
    popularity - 0.1,
    np.where(popularity == 0, popularity + 0.1, popularity),
)

spotify_df[["track_popularity", "track_pop_shift"]].describe()
| track_popularity | track_pop_shift | |
|---|---|---|
| count | 32833.000000 | 32833.000000 |
| mean | 42.477081 | 42.485307 |
| std | 24.984074 | 24.970075 |
| min | 0.000000 | 0.100000 |
| 25% | 24.000000 | 24.000000 |
| 50% | 45.000000 | 45.000000 |
| 75% | 62.000000 | 62.000000 |
| max | 100.000000 | 99.900000 |
# Rescale the shifted popularity to a fraction, then take the logit log(p / (1 - p)).
popularity_fraction = spotify_df["track_pop_shift"] / 100
spotify_df["track_pop_frac"] = popularity_fraction
spotify_df["track_popularity_tf"] = np.log(popularity_fraction / (1 - popularity_fraction))

sns.displot(data=spotify_df, x="track_popularity_tf", kind="hist", bins=25, aspect=1.25)
plt.show()
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
SAMPLE SIZE: I have decided to keep a dataset with one track per row, since the input features are the same across every row for a given track ID.
# spotify_check = spotify_df.groupby(['track_id']).aggregate(num_track_pop_values = ('track_popularity', 'nunique'),
# num_rows = ('track_id', 'count')).reset_index()
# spotify_once_list = spotify_check.loc[spotify_check.num_rows == 1].track_id.to_list()
# spotify_once = spotify_df.loc[spotify_df.track_id.isin(spotify_once_list)].copy()
# spotify_once.track_id.drop_duplicates().count()
def _first_mode(values):
    """Return the most frequent value; ties resolve to the first of the sorted modes."""
    return values.mode().iloc[0]


def _dominant_genre_pair(track_rows):
    """Pick a track's most common genre, then the most common subgenre within it.

    Taking the two modes independently can pair a genre with a subgenre from a
    different genre when counts tie (e.g. "edm" with "dance pop"). Restricting
    the subgenre to rows of the chosen genre keeps the pair consistent while
    leaving the chosen genre exactly as the independent mode would give it.
    """
    genre = _first_mode(track_rows["playlist_genre"])
    subgenres_in_genre = track_rows.loc[
        track_rows["playlist_genre"] == genre, "playlist_subgenre"
    ]
    return pd.Series(
        {
            "mostcom_playlistgenre": genre,
            "mostcom_playlistsubgenre": _first_mode(subgenres_in_genre),
        }
    )


# One row per track_id: columns track_id, mostcom_playlistgenre, mostcom_playlistsubgenre.
spotify_mode_genre = (
    spotify_df.groupby("track_id")[["playlist_genre", "playlist_subgenre"]]
    .apply(_dominant_genre_pair)
    .reset_index()
)
spotify_mode_genre
| track_id | mostcom_playlistgenre | mostcom_playlistsubgenre | |
|---|---|---|---|
| 0 | 0017A6SJgTbfQVU2EtsPNo | rock | classic rock |
| 1 | 002xjHwzEx66OWFV2IP9dk | r&b | neo soul |
| 2 | 004s3t0ONYlzxII9PLgU6z | rock | hard rock |
| 3 | 008MceT31RotUANsKuzy3L | pop | electropop |
| 4 | 008rk8F6ZxspZT4bUlkIQG | pop | dance pop |
| ... | ... | ... | ... |
| 28351 | 7zxRMhXxJMQCeDDg0rKAVo | r&b | urban contemporary |
| 28352 | 7zyLObYw4QUKQDyZOb4J0Y | r&b | new jack swing |
| 28353 | 7zycSpvjDcqh6YT1FEl2kY | pop | electropop |
| 28354 | 7zye9v6B785eFWEFYs13C2 | r&b | neo soul |
| 28355 | 7zzZmpw8L66ZPjH1M6qmOs | rock | classic rock |
28356 rows × 3 columns
# Add the derived outcome columns (in place) to the list of variables to keep.
vars_of_interest += [
    "binary_outcome",
    "binary_outcome_60",
    "binary_outcome_70",
    "track_popularity_tf",
]
vars_of_interest
['track_id', 'mostcom_playlistgenre', 'track_popularity', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'binary_outcome', 'binary_outcome_60', 'binary_outcome_70', 'track_popularity_tf']
# Outcomes (continuous and binary) and categorical inputs.
y_nums = ["track_popularity", "track_popularity_tf"]
y_cats = ["binary_outcome", "binary_outcome_60", "binary_outcome_70"]
x_cats = ["key", "mode", "mostcom_playlistgenre"]

# Continuous inputs: whatever remains in vars_of_interest once the identifier,
# the categorical inputs and every outcome are removed, in alphabetical order.
non_inputs = {"track_id", *x_cats, *y_nums, *y_cats}
x_nums = sorted(name for name in vars_of_interest if name not in non_inputs)
Creating the new dataset so that each record is ONE SONG!
# vars_of_interest[2:] skips "track_id" and the "mostcom_playlistgenre"
# placeholder; the genre columns come from the merge instead.
track_level_columns = ["track_id"] + vars_of_interest[2:]

# Attach the per-track genre labels, then drop rows that are identical across
# all columns. The original row labels of the surviving rows are kept.
spotify_new = (
    spotify_df[track_level_columns]
    .merge(spotify_mode_genre, how="inner", on=["track_id"])
    .drop_duplicates()
    .copy()
)
spotify_new
| track_id | track_popularity | danceability | energy | key | loudness | mode | speechiness | acousticness | instrumentalness | liveness | valence | tempo | duration_ms | binary_outcome | binary_outcome_60 | binary_outcome_70 | track_popularity_tf | mostcom_playlistgenre | mostcom_playlistsubgenre | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6f807x0ima9a1j3VPbc7VN | 66 | 0.748 | 0.916 | 6 | -2.634 | 1 | 0.0583 | 0.102000 | 0.000000 | 0.0653 | 0.5180 | 122.036 | 194754 | 1 | 1 | 0 | 0.663294 | edm | dance pop |
| 2 | 0r7CVbZTWZgbTCYdfa2P31 | 67 | 0.726 | 0.815 | 11 | -4.969 | 1 | 0.0373 | 0.072400 | 0.004210 | 0.3570 | 0.6930 | 99.972 | 162600 | 1 | 1 | 0 | 0.708185 | edm | dance pop |
| 4 | 1z1Hg7Vb0AhHDiEmnDE79l | 70 | 0.675 | 0.931 | 1 | -3.432 | 0 | 0.0742 | 0.079400 | 0.000023 | 0.1100 | 0.6130 | 124.008 | 176616 | 1 | 1 | 0 | 0.847298 | edm | big room |
| 7 | 75FpbthrwQmzHlBJLuGdC7 | 60 | 0.718 | 0.930 | 7 | -3.778 | 1 | 0.1020 | 0.028700 | 0.000009 | 0.2040 | 0.2770 | 121.956 | 169093 | 1 | 0 | 0 | 0.405465 | pop | dance pop |
| 8 | 1e8PAfcKUYoKkxPhrHqw4x | 69 | 0.650 | 0.833 | 1 | -4.672 | 1 | 0.0359 | 0.080300 | 0.000000 | 0.0833 | 0.7250 | 123.976 | 189052 | 1 | 1 | 0 | 0.800119 | pop | dance pop |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 32828 | 7bxnKAamR3snQ1VGLuVfC1 | 42 | 0.428 | 0.922 | 2 | -1.814 | 1 | 0.0936 | 0.076600 | 0.000000 | 0.0668 | 0.2100 | 128.170 | 204375 | 0 | 0 | 0 | -0.322773 | edm | progressive electro house |
| 32829 | 5Aevni09Em4575077nkWHz | 20 | 0.522 | 0.786 | 0 | -4.462 | 1 | 0.0420 | 0.001710 | 0.004270 | 0.3750 | 0.4000 | 128.041 | 353120 | 0 | 0 | 0 | -1.386294 | edm | progressive electro house |
| 32830 | 7ImMqPP3Q1yfUHvsdn7wEo | 14 | 0.529 | 0.821 | 6 | -4.899 | 0 | 0.0481 | 0.108000 | 0.000001 | 0.1500 | 0.4360 | 127.989 | 210112 | 0 | 0 | 0 | -1.815290 | edm | progressive electro house |
| 32831 | 2m69mhnfQ1Oq6lGtXuYhgX | 15 | 0.626 | 0.888 | 2 | -3.361 | 1 | 0.1090 | 0.007920 | 0.127000 | 0.3430 | 0.3080 | 128.008 | 367432 | 0 | 0 | 0 | -1.734601 | edm | progressive electro house |
| 32832 | 29zWqhca3zt5NsckZqDf6c | 27 | 0.603 | 0.884 | 5 | -4.571 | 0 | 0.0385 | 0.000133 | 0.341000 | 0.7420 | 0.0894 | 127.984 | 337500 | 0 | 0 | 0 | -0.994623 | edm | progressive electro house |
28356 rows × 20 columns
1 - Counts of categorical variables
# One count plot per categorical input.
for var in x_cats:
    sns.catplot(data=spotify_new, x=var, hue=var, kind="count", aspect=3, palette="coolwarm")
    plt.title("Bar Chart of %s" % var, fontsize=16, fontweight="bold")
    # The subgenre column in spotify_new is named "mostcom_playlistsubgenre"
    # (the name the other plotting cells check for); the previous check against
    # "playlist_subgenre" could never match.
    if var == "mostcom_playlistsubgenre":
        # Tilt the many subgenre labels by 45 degrees.
        plt.xticks(rotation=45)
    plt.show()
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
2 - Distributions of continuous variables
# Histogram with a KDE overlay for each continuous input.
hist_title_style = {"fontsize": 14, "fontweight": "bold"}
for feature in x_nums:
    sns.displot(
        data=spotify_new,
        x=feature,
        kind="hist",
        bins=15,
        kde=True,
        common_norm=False,
        aspect=1.5,
    )
    plt.title("Histogram of %s" % feature, **hist_title_style)
    plt.show()
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
# for var in spotify_nums:
# sns.catplot(data=spotify_new, y=var, kind="box", aspect=1.5)
# plt.title("Boxplots of %s" % var, fontsize=14, fontweight="bold")
# plt.show()
# Wide-form box plot: one box per continuous input, all on a shared axis.
numeric_inputs = spotify_new[x_nums]
sns.catplot(data=numeric_inputs, kind="box", aspect=3)
plt.title("Boxplots of All Numerical Vars", fontsize=14, fontweight="bold")
plt.show()
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
Plotting Raw Data¶
3 - Relationships between continuous variables
Heatmaps - Correlation Plots
# Pairwise correlations between the continuous inputs, drawn as an annotated
# heatmap on a fixed [-1, 1] colour scale centred at 0.
corr_matrix_raw = spotify_new[x_nums].corr(numeric_only=True)

heatmap_options = dict(
    vmin=-1,
    vmax=1,
    center=0,
    annot=True,
    annot_kws={"fontsize": 10},
    cmap="coolwarm",
)

fig, ax = plt.subplots(figsize=(18, 6))
sns.heatmap(corr_matrix_raw, ax=ax, **heatmap_options)
ax.set_title("Correlation Plots of All Numeric Variables - Raw Data", fontsize=14, fontweight="bold")
plt.show()
Correlations with an absolute value above 0.30 are:
- Loudness with Energy (0.68)
- Valence with Danceability (0.33)
- Acousticness with Energy (-0.55)
- Acousticness with Loudness (-0.37)
Seeing if Correlation changes between numeric variables by categorical inputs
# For each categorical input, draw one correlation heatmap of the continuous
# inputs per level, stacked vertically in a single figure.
for var in x_cats:
    # Sorted levels that actually occur in the data; one panel per level.
    var_groups = np.sort(spotify_new[var].unique()).tolist()
    # observed=True restricts categorical groupers to levels present in the
    # data, which are the only ones looked up below.
    corr_groups = spotify_new.loc[:, [var] + x_nums].groupby(var, observed=True).corr()

    fig, axs = plt.subplots(
        len(var_groups),
        1,
        # Scale the height with the number of panels so that a 2-level
        # variable is not stretched over the space needed for 12 levels.
        figsize=(16, 4 * len(var_groups)),
        sharex=True,
        sharey=True,
        # Always return a 2-D array of axes, so a single-level variable
        # does not yield a bare Axes that cannot be indexed.
        squeeze=False,
    )

    for ax, group in zip(axs[:, 0], var_groups):
        sns.heatmap(
            data=corr_groups.loc[group],
            vmin=-1,
            vmax=1,
            center=0,
            annot=True,
            annot_kws={"fontsize": 8},
            cmap="coolwarm",
            ax=ax,
        )
        # Name the grouping variable as well as the level; otherwise panels
        # such as key = 1 and mode = 1 carry identical titles.
        ax.set_title(
            "Correlation Plots of All Numeric Variables by %s = %s" % (var, group),
            fontsize=14,
            fontweight="bold",
        )
    plt.show()
The correlations seem very similar between groups of key and mode, but are different with playlist_genre.
Pairs Plot
# Scatter matrix of the continuous inputs with KDE curves on the diagonal.
pair_plot_data = spotify_new[x_nums]
sns.pairplot(pair_plot_data, diag_kind="kde", diag_kws=dict(common_norm=False))
plt.show()
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
4 - Summaries of the continuous variables grouped by categorical variables
Point Plots
# Point plot (no connecting line) of every continuous input against every
# categorical input.
for cat_input in x_cats:
    # Only the subgenre axis gets its tick labels tilted.
    tilt_labels = cat_input == "mostcom_playlistsubgenre"
    for num_input in x_nums:
        sns.catplot(
            data=spotify_new,
            x=cat_input,
            y=num_input,
            hue=cat_input,
            kind="point",
            palette="coolwarm",
            linestyle="none",
            aspect=2,
        )
        if tilt_labels:
            plt.xticks(rotation=45)
        plt.title("Point Plots of %s by %s" % (num_input, cat_input), fontsize=14, fontweight="bold")
        plt.show()
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
Violin Plots
# Violin plot of every continuous input split by every categorical input.
violin_options = dict(kind="violin", palette="coolwarm", aspect=2)
violin_title_style = dict(fontsize=14, fontweight="bold")

for cat_input in x_cats:
    for num_input in x_nums:
        sns.catplot(data=spotify_new, x=cat_input, y=num_input, hue=cat_input, **violin_options)
        if cat_input == "mostcom_playlistsubgenre":
            # Tilt the subgenre tick labels by 45 degrees.
            plt.xticks(rotation=45)
        plt.title("Violin Plots of %s by %s" % (num_input, cat_input), **violin_title_style)
        plt.show()
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
6 - Scatterplots looking at continuous outcome with continuous inputs.
Summarize the response with boxplots for the unique values of the categorical inputs
Trend Plots
# Scatter plot with a fitted linear trend: logit-transformed popularity on the
# x-axis, each continuous input in turn on the y-axis.
point_style = {"alpha": 0.5}
trend_line_style = {"color": "orange", "alpha": 1, "linewidth": 2}

for feature in x_nums:
    sns.lmplot(
        data=spotify_new,
        x="track_popularity_tf",
        y=feature,
        scatter_kws=point_style,
        line_kws=trend_line_style,
    )
    plt.title("Trend Plots of %s by Track Popularity" % (feature), fontsize=14, fontweight="bold")
    plt.show()
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
Boxplots
# Box plot of the logit-transformed popularity for each level of every
# categorical input.
for cat_input in x_cats:
    sns.catplot(
        data=spotify_new,
        x=cat_input,
        y="track_popularity_tf",
        hue=cat_input,
        kind="box",
        palette="coolwarm",
        aspect=2,
    )
    plt.title("Box Plots of Track Popularity by %s" % (cat_input), fontsize=14, fontweight="bold")
    if cat_input == "mostcom_playlistsubgenre":
        # Tilt the subgenre tick labels by 45 degrees.
        plt.xticks(rotation=45)
    plt.show()
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
Point Plots
# Point plot of the transformed popularity across the levels of each categorical input.
for var in x_cats:
    sns.catplot(
        data=spotify_new,
        x=var,
        y="track_popularity_tf",
        hue=var,
        kind="point",
        palette="coolwarm",
        aspect=2,
    )
    plt.title(f"Point Plots of Track Popularity by {var}", fontsize=14, fontweight="bold")
    plt.show()
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
7 - Visualize histograms and relationships between continuous inputs broken up by the outcome unique values.
7 - Count the Number of Observations for Each Combination of Outcome and Categorical Variables
KDE Plots of Continuous Vars by Outcome
# Density of each continuous input, drawn separately for each binary outcome.
# common_norm=False normalises every outcome group on its own so the curves
# are comparable even though the groups differ in size.
for var in x_nums:
    sns.displot(data=spotify_new, x=var, hue="binary_outcome", kind="kde", common_norm=False)
    # The figure is a KDE, so label it as one (it was previously titled "Histogram").
    plt.title(f"KDE Plot of {var} by Outcome")
    plt.show()
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
Pairs Plots of Continuous Vars by Outcome
# Pairwise scatter plots of the continuous inputs, coloured by outcome,
# with per-group KDEs on the diagonal.
pair_cols = ["binary_outcome"] + x_nums
sns.pairplot(
    data=spotify_new[pair_cols],
    hue="binary_outcome",
    diag_kind="kde",
    diag_kws={"common_norm": False},
)
plt.show()
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
Boxplots of Continuous Vars by Outcome
# Box plot of every continuous input split by the binary outcome.
for var in x_nums:
    sns.catplot(
        data=spotify_new,
        x="binary_outcome",
        y=var,
        hue="binary_outcome",
        kind="box",
        aspect=2,
    )
    plt.title(f"Boxplots of {var} by Outcome")
    plt.show()
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
Point Plots of Continuous Vars by Outcome
# Point plot of every continuous input split by the binary outcome.
for var in x_nums:
    sns.catplot(
        data=spotify_new,
        x="binary_outcome",
        y=var,
        hue="binary_outcome",
        kind="point",
        aspect=2,
    )
    plt.title(f"Point Plots of {var} by Outcome")
    plt.show()
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
TRANSFORMATION: I want to transform these variables and replot the histograms!¶
The following are showing skewness:
- Left:
- Danceability
- Energy
- Loudness
- Right:
- Speechiness
- Acousticness
- Instrumentalness
- Liveness
# Work on a copy so the transformed columns never touch spotify_new.
spotify_transf = spotify_new.copy()
# Summary statistics of the untransformed continuous inputs.
spotify_transf[x_nums].describe()
| acousticness | danceability | duration_ms | energy | instrumentalness | liveness | loudness | speechiness | tempo | valence | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 28356.000000 | 28356.000000 | 28356.000000 | 28356.000000 | 28356.000000 | 28356.000000 | 28356.000000 | 28356.000000 | 28356.00000 | 28356.000000 |
| mean | 0.177176 | 0.653372 | 226575.967026 | 0.698388 | 0.091117 | 0.190958 | -6.817696 | 0.107954 | 120.95618 | 0.510387 |
| std | 0.222803 | 0.145785 | 61078.450819 | 0.183503 | 0.232548 | 0.155894 | 3.036243 | 0.102556 | 26.95456 | 0.234340 |
| min | 0.000000 | 0.000000 | 4000.000000 | 0.000175 | 0.000000 | 0.000000 | -46.448000 | 0.000000 | 0.00000 | 0.000000 |
| 25% | 0.014375 | 0.561000 | 187742.000000 | 0.579000 | 0.000000 | 0.092600 | -8.309250 | 0.041000 | 99.97200 | 0.329000 |
| 50% | 0.079700 | 0.670000 | 216933.000000 | 0.722000 | 0.000021 | 0.127000 | -6.261000 | 0.062600 | 121.99300 | 0.512000 |
| 75% | 0.260000 | 0.760000 | 254975.250000 | 0.843000 | 0.006570 | 0.249000 | -4.709000 | 0.133000 | 133.99900 | 0.695000 |
| max | 0.994000 | 0.983000 | 517810.000000 | 1.000000 | 0.994000 | 0.996000 | 1.275000 | 0.918000 | 239.44000 | 0.991000 |
For variables that are bounded between 0 and 1.
# Inputs bounded on [0, 1] that get log and logit transforms.
# Each name appears exactly once: "acousticness" was previously listed twice,
# which duplicated columns in the summary table and in the later plot loops.
logtf_list = ["acousticness", "danceability", "energy", "speechiness", "instrumentalness", "liveness"]

# log(0) and logit(0) / logit(1) are undefined, so exact boundary values are
# nudged inward by this amount before transforming.
# NOTE(review): moving exact zeros to 0.01 puts them above genuine small
# positive values (the observed minima are around 1e-6), so the shift is not
# order-preserving. Kept as-is to leave the downstream results unchanged;
# consider clipping to a much smaller epsilon instead.
BOUNDARY_SHIFT = 0.01

for var in logtf_list:
    raw = spotify_transf[var]
    shifted = np.where(raw == 1.0, raw - BOUNDARY_SHIFT, raw)
    shifted = np.where(raw == 0.0, raw + BOUNDARY_SHIFT, shifted)
    spotify_transf[var + "_shift"] = shifted
    spotify_transf[var + "_logit"] = np.log(spotify_transf[var + "_shift"] / (1 - spotify_transf[var + "_shift"]))
    spotify_transf[var + "_log"] = np.log(spotify_transf[var + "_shift"])

# Loudness is not confined to [0, 1], so it gets power transforms instead.
extra_list = ["loudness"]
for var in extra_list:
    spotify_transf[var + "_sqrd"] = spotify_transf[var] ** 2
    spotify_transf[var + "_cubed"] = spotify_transf[var] ** 3

# Summary of the boundary-shifted columns.
spotify_transf.loc[:, [var + "_shift" for var in logtf_list]].describe()
| acousticness_shift | danceability_shift | energy_shift | speechiness_shift | acousticness_shift | instrumentalness_shift | liveness_shift | |
|---|---|---|---|---|---|---|---|
| count | 28356.000000 | 28356.000000 | 28356.000000 | 28356.000000 | 28356.000000 | 28356.000000 | 28356.000000 |
| mean | 0.177176 | 0.653373 | 0.698386 | 0.107954 | 0.177176 | 0.094705 | 0.190959 |
| std | 0.222803 | 0.145784 | 0.183501 | 0.102556 | 0.222803 | 0.231188 | 0.155894 |
| min | 0.000001 | 0.010000 | 0.000175 | 0.010000 | 0.000001 | 0.000001 | 0.009360 |
| 25% | 0.014375 | 0.561000 | 0.579000 | 0.041000 | 0.014375 | 0.000229 | 0.092600 |
| 50% | 0.079700 | 0.670000 | 0.722000 | 0.062600 | 0.079700 | 0.010000 | 0.127000 |
| 75% | 0.260000 | 0.760000 | 0.843000 | 0.133000 | 0.260000 | 0.010000 | 0.249000 |
| max | 0.994000 | 0.983000 | 0.999000 | 0.918000 | 0.994000 | 0.994000 | 0.996000 |
# Side-by-side box plots of each raw column and its boundary-shifted version.
for var in logtf_list:
    raw_and_shifted = spotify_transf[[var, var + "_shift"]]
    sns.catplot(data=raw_and_shifted, kind="box")
    plt.show()
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
/Applications/anaconda3/envs/cmpinf2100/lib/python3.8/site-packages/seaborn/axisgrid.py:123: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
# epsilon = 1e-9
# for var in ["speechiness", "acousticness", "instrumentalness", "liveness"]:
# spotify_transf[var+"_recip"] = (1/spotify_transf[var])
# spotify_transf[var+"_log"] = np.log(spotify_transf[var] + epsilon)
# spotify_transf[var+"_log10"] = np.log10(spotify_transf[var] + epsilon)
# spotify_transf[var+"_log1x10"] = np.log10(spotify_transf[var] + 1)
# spotify_transf[var+"_sqrt"] = np.sqrt(spotify_transf[var])
# pt = PowerTransformer(method='yeo-johnson')
# spotify_transf[var + "_yeojohnson"] = pt.fit_transform(spotify_transf[[var]])
# spotify_transf[var+"_sqrd"] = (spotify_transf[var])**2
# spotify_transf[var+"_cubed"] = (spotify_transf[var])**3
# spotify_transf[var+'_expo'] = np.exp(spotify_transf[var])
# spotify_transf[var+"_reflected"] = spotify_transf[var].max() - spotify_transf[var]
# spotify_transf[var+'_log_reflected'] = np.log(spotify_transf[var+'_reflected'] + epsilon)
# spotify_transf[var+"_recipsqrt"] = 1 / np.sqrt(spotify_transf[var] + 1e-9)
# spotify_transf[var+"_arcsin"] = np.arcsin(np.sqrt(spotify_transf[var]))
# spotify_transf[var+"_logit"] = np.log((spotify_transf[var] + epsilon) / (1 - spotify_transf[var] + epsilon))
# transformed_data, fitted_lambda = boxcox(spotify_transf[var] + epsilon) # Adding small constant for safety
# spotify_transf[var + "_boxcox"] = transformed_data
For left skewed variables (danceability, energy, and loudness), we will do a variety of transformations.
# for var in ["danceability", "energy", "loudness"]:
# spotify_transf[var+"_sqrd"] = (spotify_transf[var])**2
# spotify_transf[var+"_expo"] = np.exp(spotify_transf[var])
Since I cannot transform the loudness variable I am going to do Winsorization (capping the outliers).
# def iqrcap_winsor(input, var):
# Q1 = spotify_transf[var].quantile(0.25)
# Q3 = spotify_transf[var].quantile(0.75)
# IQR = Q3 - Q1
# # Calculate the lower and upper bounds
# lower_bound = Q1 - 1.5 * IQR
# upper_bound = Q3 + 1.5 * IQR
# input[var+"_iqrcap"] = input[var].clip(lower=lower_bound, upper=upper_bound)
# iqrcap_winsor(spotify_transf, "loudness")
Review the Histograms of Transformed Variables
# logtf_list = ["danceability", "energy", "acousticness", "instrumentalness"]
# Every transformed column produced above: logit and log versions of the
# bounded inputs, plus the two loudness power transforms.
x_nums_std = (
    [name + "_logit" for name in logtf_list]
    + [name + "_log" for name in logtf_list]
    + ["loudness_sqrd", "loudness_cubed"]
)

# Histogram with a KDE overlay for each transformed column.
for varn in x_nums_std:
    sns.displot(
        data=spotify_transf,
        x=varn,
        kind="hist",
        bins=11,
        kde=True,
        common_norm=False,
        aspect=1.5,
    )
    plt.title(f"Histogram of {varn}", fontsize=14, fontweight="bold")
    plt.show()
from scipy import stats

# Normality check for every column in var_list: print the Shapiro-Wilk
# p-value and draw a normal probability (Q-Q) plot.
# NOTE(review): SciPy documents that the Shapiro-Wilk p-value may be
# inaccurate for N > 5000, and this table has ~28k rows, so rely mostly on
# the Q-Q plot.
for var in var_list:
    data = spotify_transf[var]
    stat, p_value = stats.shapiro(data)
    print(f"Var: {var}, Shapiro-Wilk p-value: {p_value:f}")
    stats.probplot(data, dist="norm", plot=plt)
    plt.show()
Variables to use after transforming:
- Track_popularity
- Danceability_logit
- Energy_logit
- Instrumentalness_logit
- Acousticness_logit
- Liveness_log
- Valence
- Duration_ms
PRE-PROCESSING BEFORE STANDARDIZING
var_list
# Keep only the var_list columns and drop any row with a missing value.
spotify_clean = spotify_transf[var_list].dropna()
spotify_clean
# Box plots on the original scales, for comparison with the standardised version.
sns.catplot(data=spotify_clean, kind="box", aspect=3)
plt.xticks(rotation=45)
plt.show()
from sklearn.preprocessing import StandardScaler

# Standardise every column of spotify_clean.
Xspot = StandardScaler().fit_transform(spotify_clean)

# Box plots of the standardised columns.
sns.catplot(data=pd.DataFrame(Xspot, columns=spotify_clean.columns), kind="box", aspect=3)
plt.xticks(rotation=45)
plt.show()

# Rebuild a labelled frame that keeps the row index of spotify_clean.
spotifyclean_df = pd.DataFrame(Xspot, columns=spotify_clean.columns, index=spotify_clean.index)

# Re-attach identifier, outcome and categorical columns, aligned on that index.
# Maps new column name -> source column in spotify_transf.
# NOTE(review): "binary_outcome70" lacks the underscore used by
# "binary_outcome_60"; the name is kept because later cells may depend on it.
carry_over = {
    "track_id": "track_id",
    "binary_outcome": "binary_outcome",
    "binary_outcome_60": "binary_outcome_60",
    "binary_outcome70": "binary_outcome_70",
    "mostcom_playlistgenre": "mostcom_playlistgenre",
    "mostcom_playlistsubgenre": "mostcom_playlistsubgenre",
    "key": "key",
    "mode": "mode",
}
for new_col, source_col in carry_over.items():
    spotifyclean_df[new_col] = spotify_transf[source_col].reindex(spotify_clean.index)

spotifyclean_df.describe()
Plotting Transformed and Standardized Data¶
3 - Relationships between continuous variables
Heatmaps - Correlation Plots
# Correlation matrix of the binary outcome together with the var_list columns.
corr_cols = ["binary_outcome"] + var_list
corr_matrix = spotifyclean_df[corr_cols].corr()

fig, ax = plt.subplots(figsize=(18, 6))
sns.heatmap(
    data=corr_matrix,
    ax=ax,
    cmap="coolwarm",
    center=0,
    vmin=-1,
    vmax=1,
    annot=True,
    annot_kws={"fontsize": 10},
)
ax.set_title("Correlation Plots of All Numeric Variables", fontsize=16, fontweight="bold")
plt.show()
Correlations higher than abs(.30) are:
- Valence with Danceability (0.32)
- Acousticness with Energy (-0.52)
Seeing if Correlation changes between numeric variables by categorical inputs
# For each grouping variable, draw one correlation heatmap per group level,
# stacked vertically in a single figure.
heatlist = ["key", "mode", "mostcom_playlistgenre"]
for var in heatlist:
    var_groups = np.sort(spotifyclean_df[var].unique()).tolist()
    # Per-group correlation matrices, indexed first by group level.
    grouped_cols = [var] + var_list[:11]
    corr_groups = spotifyclean_df[grouped_cols].groupby(var).corr()

    fig, axs = plt.subplots(len(var_groups), 1, figsize=(16, 45), sharex=True, sharey=True)
    for panel, level in zip(axs, var_groups):
        sns.heatmap(
            data=corr_groups.loc[level],
            ax=panel,
            cmap="coolwarm",
            center=0,
            vmin=-1,
            vmax=1,
            annot=True,
            annot_kws={"fontsize": 8},
        )
        panel.set_title(f"Correlation Plots of All Numeric Variables by {level}", fontsize=14, fontweight="bold")
    plt.show()
The correlations seem very similar between groups of key and mode, but are different with playlist_genre. This is what we saw with the raw data.
Pairs Plots
# Pairwise scatter plots of the standardised var_list columns, KDEs on the diagonal.
sns.pairplot(
    data=spotifyclean_df[var_list],
    diag_kind="kde",
    diag_kws={"common_norm": False},
)
plt.show()
4 - Summaries of the continuous variables grouped by categorical variables
Point Plots
# Point plot of each continuous variable (all of var_list but the first)
# across the levels of each categorical variable (all of cat_list but the last).
for var2 in cat_list[:-1]:
    tilt_labels = var2 == "mostcom_playlistsubgenre"
    for var in var_list[1:]:
        sns.catplot(
            data=spotifyclean_df,
            x=var2,
            y=var,
            hue=var2,
            kind="point",
            linestyle="none",
            palette="coolwarm",
            aspect=2,
        )
        # Tilt the tick labels by 45 degrees for the subgenre axis only.
        if tilt_labels:
            plt.xticks(rotation=45)
        plt.title(f"Point Plots of {var} by {var2}", fontsize=14, fontweight="bold")
        plt.show()
Box Plots
# Box plot of each continuous variable (all of var_list but the first)
# across the levels of each categorical variable (all of cat_list but the last).
for var2 in cat_list[:-1]:
    tilt_labels = var2 == "mostcom_playlistsubgenre"
    for var in var_list[1:]:
        sns.catplot(
            data=spotifyclean_df,
            x=var2,
            y=var,
            hue=var2,
            kind="box",
            palette="coolwarm",
            aspect=2,
        )
        # Tilt the tick labels by 45 degrees for the subgenre axis only.
        if tilt_labels:
            plt.xticks(rotation=45)
        plt.title(f"Box Plots of {var} by {var2}", fontsize=14, fontweight="bold")
        plt.show()
Violin Plots
# Violin plot of each continuous variable (all of var_list but the first)
# across the levels of each categorical variable (all of cat_list but the last).
for var2 in cat_list[:-1]:
    tilt_labels = var2 == "mostcom_playlistsubgenre"
    for var in var_list[1:]:
        sns.catplot(
            data=spotifyclean_df,
            x=var2,
            y=var,
            hue=var2,
            kind="violin",
            palette="coolwarm",
            aspect=2,
        )
        # Tilt the tick labels by 45 degrees for the subgenre axis only.
        if tilt_labels:
            plt.xticks(rotation=45)
        plt.title(f"Violin Plots of {var} by {var2}", fontsize=14, fontweight="bold")
        plt.show()
6 - Scatterplots looking at continuous outcome with continuous inputs.
Summarize the response with boxplots for the unique values of the categorical inputs
Trend Plots
# Scatter plot with a fitted linear trend: transformed popularity on the
# x-axis against each remaining var_list column on the y-axis.
point_style = {"alpha": 0.5}
trend_style = {"color": "orange", "alpha": 1, "linewidth": 2}
for var in var_list[1:]:
    sns.lmplot(
        data=spotifyclean_df,
        x="track_popularity_tf",
        y=var,
        scatter_kws=point_style,
        line_kws=trend_style,
    )
    plt.title(f"Trend Plots of {var} by Track Popularity", fontsize=14, fontweight="bold")
    plt.show()
Boxplots
# Box plot of the standardised popularity for each categorical input.
# The former "mostcom_playlistsubgenre" tick-rotation branch was removed:
# that name is not in the list iterated here, so the branch could never run.
for var in ["key", "mode", "mostcom_playlistgenre"]:
    sns.catplot(
        data=spotifyclean_df,
        x=var,
        y="track_popularity_tf",
        hue=var,
        kind="box",
        palette="coolwarm",
        aspect=2,
    )
    plt.title(f"Box Plots of Track Popularity by {var}", fontsize=14, fontweight="bold")
    plt.show()
Point Plots
# Point plot of the standardised popularity for each categorical input.
for var in ["key", "mode", "mostcom_playlistgenre"]:
    sns.catplot(
        data=spotifyclean_df,
        x=var,
        y="track_popularity_tf",
        hue=var,
        kind="point",
        palette="coolwarm",
        aspect=2,
    )
    plt.title(f"Point Plots of Track Popularity by {var}", fontsize=14, fontweight="bold")
    plt.show()
7 - Visualize histograms and relationships between continuous inputs broken up by the outcome unique values.
7 - Count the Number of Observations for Each Combination of Outcome and Categorical Variables
KDE Plots of Continuous Vars by Outcome
# Density of each standardised continuous input, drawn separately for each
# binary outcome. common_norm=False normalises every outcome group on its own.
for var in var_list[1:]:
    sns.displot(data=spotifyclean_df, x=var, hue="binary_outcome", kind="kde", common_norm=False)
    # The figure is a KDE, so label it as one (it was previously titled "Histogram").
    plt.title(f"KDE Plot of {var} by Outcome")
    plt.show()
Pairs Plots of Continuous Vars by Outcome
# Pairwise scatter plots of the standardised inputs, coloured by outcome,
# with per-group KDEs on the diagonal.
outcome_pair_cols = ["binary_outcome"] + var_list[1:]
sns.pairplot(
    data=spotifyclean_df[outcome_pair_cols],
    hue="binary_outcome",
    diag_kind="kde",
    diag_kws={"common_norm": False},
)
plt.show()
Boxplots of Continuous Vars by Outcome
# Box plot of every standardised continuous input split by the binary outcome.
for var in var_list[1:]:
    sns.catplot(
        data=spotifyclean_df,
        x="binary_outcome",
        y=var,
        hue="binary_outcome",
        kind="box",
        aspect=2,
    )
    plt.title(f"Boxplots of {var} by Outcome")
    plt.show()
Point Plots of Continuous Vars by Outcome
# Point plot of every standardised continuous input split by the binary outcome.
for var in var_list[1:]:
    sns.catplot(
        data=spotifyclean_df,
        x="binary_outcome",
        y=var,
        hue="binary_outcome",
        kind="point",
        aspect=2,
    )
    plt.title(f"Point Plots of {var} by Outcome")
    plt.show()
Count the Number of Observations for Each Combination of Outcome and Categorical Variables
# For each categorical input: chi-square test of independence against the
# outcome, then a count heatmap and a dodged bar chart side by side.
for var in cat_list[:-1]:
    # Only rows with both the categorical input and the outcome present are used.
    df_clean = spotifyclean_df.dropna(subset=[var, "binary_outcome"])

    outcome_by_level = pd.crosstab(df_clean["binary_outcome"], df_clean[var])
    chi2, p_value, dof, expected = stats.chi2_contingency(outcome_by_level)
    print(f"Variable: {var}, Chi-Square Value: {chi2:f}, P-value: {p_value:f}")

    fig, ax = plt.subplots(1, 2, figsize=(14, 6))

    # Left panel: contingency counts, including row/column totals.
    counts_with_totals = pd.crosstab(df_clean[var], df_clean["binary_outcome"], margins=True)
    sns.heatmap(data=counts_with_totals, ax=ax[0], annot=True, cmap="coolwarm", fmt="d")
    ax[0].set_title(f"Heatmap of {var} by Outcome", fontsize=12, fontweight="bold")

    # Right panel: dodged bar chart of the same counts.
    sns.countplot(data=df_clean, x=var, hue="binary_outcome", palette="coolwarm", dodge=True, ax=ax[1])
    ax[1].set_title(f"Dodge Bar Chart of {var} by Outcome", fontsize=12, fontweight="bold")

    # Tilt the tick labels by 45 degrees for the subgenre axis only.
    if var == "mostcom_playlistsubgenre":
        plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
8 - Additional Plots
Comparing Categorical Inputs
import itertools

# Every unordered pair of categorical inputs (cat_list without its last entry).
var_pairs = list(itertools.combinations(cat_list[:-1], 2))

for var, var2 in var_pairs:
    # Only rows where both categorical inputs are present are used.
    df_clean = spotifyclean_df.dropna(subset=[var, var2])

    # Chi-square test of independence between the two inputs.
    chi2, p_value, dof, expected = stats.chi2_contingency(pd.crosstab(df_clean[var2], df_clean[var]))
    # Name both variables: the label previously printed only the first one,
    # so the output lines for different pairs could not be told apart.
    print(f"Variables: {var} x {var2}, Chi-Square Value: {chi2:f}, P-value: {p_value:f}")

    fig, ax = plt.subplots(2, 1, figsize=(14, 14))

    # Top panel: contingency counts, including row/column totals.
    sns.heatmap(
        data=pd.crosstab(df_clean[var], df_clean[var2], margins=True),
        ax=ax[0],
        annot=True,
        cmap="coolwarm",
        fmt="d",
    )
    ax[0].set_title(f"Heatmap of {var} by {var2}", fontsize=12, fontweight="bold")

    # Bottom panel: dodged bar chart of the same counts.
    sns.countplot(data=df_clean, x=var, hue=var2, palette="coolwarm", dodge=True, ax=ax[1])
    ax[1].set_title(f"Dodge Bar Chart of {var} by {var2}", fontsize=12, fontweight="bold")

    # Tilt the tick labels by 45 degrees when subgenre is on the x-axis.
    if var == "mostcom_playlistsubgenre":
        plt.xticks(rotation=45)

    plt.tight_layout()
    plt.show()
Plotting Trend Plots for Logistic Regressions
# Fitted logistic curve of the binary outcome against each continuous input.
for var in var_list[1:]:
    sns.lmplot(
        data=spotifyclean_df,
        x=var,
        y="binary_outcome",
        logistic=True,
    )
    plt.show()
# Share of non-events and events in the cleaned data.
# Computed from the column rather than from hard-coded counts (17850 and
# 10506), so the proportions stay correct if the filtering above changes
# and both values are displayed together.
spotifyclean_df["binary_outcome"].value_counts(normalize=True)
Variables that are potential inputs:
- Continuous:
- Danceability (danceability_logit)
- Energy (energy_logit)
- Acousticness (acousticness_logit)
- Liveness (liveness_log)
- Valence
- Duration (duration_ms)
I included these specific variables because they had trends with the track popularity variable, and the averages differed between binary outcome groups on the point plots.
- Categorical:
- Key
- Playlist genre (mostcom_playlistgenre)
I included these variables because there seem to be more non-events (63%) than events (37%) overall in each of these variables. I chose key because it seems to have differences in averages of the continuous inputs. The playlist_genre variable also seems to show differences in the continuous inputs on both the boxplots and the point plots, and when looking at the average of track_popularity_tf.
# Candidate variables: everything in var_list followed by cat_list, in that
# order, minus the two names excluded here.
excluded_vars = ["mode", "instrumentalness_logit"]
spotify_var_interest = [name for name in var_list + cat_list if name not in excluded_vars]
spotify_var_interest
3. Clustering¶
from sklearn.cluster import KMeans
from scipy.cluster import hierarchy
Create a copy of the main dataset so that we can add Hierarchy Clusters later.
# Independent copy of the cleaned frame; the cluster labels are added to this copy below.
spotifyclean_df_copy = spotifyclean_df.copy()
Drop the NAs
# Rows with a missing value in ANY column are removed here, so spotify_cluster
# can have fewer rows than spotifyclean_df / spotifyclean_df_copy.
spotify_cluster = spotifyclean_df.dropna().copy()
Keep the numeric features.
# Clustering inputs: spotify_var_interest with its first entry and its last
# three entries sliced off.
# NOTE(review): this positional slice assumes a fixed ordering of
# spotify_var_interest — confirm it still selects the intended columns if
# var_list or cat_list change.
spotify_features = spotify_cluster.loc[:, spotify_var_interest[1:-3]].copy()
spotify_features
Plotting the total within-cluster sum of squares against the number of clusters to find the best number of clusters
# Elbow plot: fit KMeans for 1..30 clusters and record the total
# within-cluster sum of squares (inertia) of each fit.
K = range(1, 31)
tots_within = []
for k in K:
    km = KMeans(n_clusters=k, random_state=2100, n_init=25, max_iter=500)
    km.fit(spotify_features)
    tots_within.append(km.inertia_)

fig, ax = plt.subplots()
ax.plot(K, tots_within, "bo-")
ax.set_xlabel("number of clusters")
ax.set_ylabel("total within sum of squares")
plt.show()
Hierarchical Clustering
# Ward-linkage hierarchical clustering of the feature rows.
hclust_ward = hierarchy.ward(spotify_features)

# Dendrogram of the linkage; leaf labels are suppressed.
plt.figure(figsize=(10, 7))
dn = hierarchy.dendrogram(hclust_ward, no_labels=True)
plt.xlabel("Sample Index or (Cluster Size)")
plt.ylabel("Distance")
plt.title("Hierarchical Clustering Dendrogram", fontsize=14, fontweight="bold")
plt.show()
Based on both the plots, the best number of clusters is 3.
# hclust_group = hierarchy.cut_tree(hclust_ward, n_clusters=4).ravel()
# Cut the Ward tree at height 150 to get one cluster label per clustered row.
hclust_group = hierarchy.cut_tree(hclust_ward, height=150).ravel()

# The labels belong to the rows of spotify_features (the dropna() subset), so
# they must be indexed by that frame. Using spotifyclean_df_copy.index raised
# a length mismatch whenever dropna() removed a row. The column assignment
# aligns on the index, so rows that were not clustered receive NaN.
spotifyclean_df_copy["hclustgroup"] = pd.Series(hclust_group, index=spotify_features.index).astype("category")
spotifyclean_df_copy.hclustgroup.value_counts(normalize=True)
1 - After identifying the optimal number of clusters, compare the cluster assignments to unique values of several of the categorical inputs
# For the third-to-last and second-to-last entries of spotify_var_interest:
# chi-square test against the cluster assignment, then a count heatmap and a
# dodged bar chart side by side.
for var in spotify_var_interest[-3:-1]:
    # Only rows with both the variable and a cluster label present are used.
    df_clean = spotifyclean_df_copy.dropna(subset=[var, "hclustgroup"])

    cluster_by_level = pd.crosstab(df_clean["hclustgroup"], df_clean[var])
    chi2, p_value, dof, expected = stats.chi2_contingency(cluster_by_level)
    print(f"Variable: {var}, Chi-Square Value: {chi2:f}, P-value: {p_value:f}")

    fig, ax = plt.subplots(1, 2, figsize=(14, 6))

    # Left panel: contingency counts, including row/column totals.
    counts_with_totals = pd.crosstab(df_clean[var], df_clean["hclustgroup"], margins=True)
    sns.heatmap(data=counts_with_totals, ax=ax[0], annot=True, cmap="coolwarm", fmt="d")
    ax[0].set_title(f"Heatmap of {var} by Cluster Assignment", fontsize=12, fontweight="bold")

    # Right panel: dodged bar chart of the same counts.
    sns.countplot(data=df_clean, x=var, hue="hclustgroup", palette="coolwarm", dodge=True, ax=ax[1])
    ax[1].set_title(f"Dodge Bar Chart of {var} by Cluster Assignment", fontsize=12, fontweight="bold")

    # Tilt the tick labels by 45 degrees for the subgenre axis only.
    if var == "mostcom_playlistsubgenre":
        plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
2/3 - Summarize the continuous inputs associated with each of the cluster assignments
Box Plots comparing the Cluster groups with each variable of interest
# Box plot of each variable of interest (all but the last three entries) per cluster.
for var in spotify_var_interest[:-3]:
    sns.catplot(
        data=spotifyclean_df_copy,
        x="hclustgroup",
        y=var,
        hue="hclustgroup",
        kind="box",
        aspect=2,
    )
    plt.title(f"Boxplots of {var} by Cluster Groups")
    plt.show()
Point Plots comparing the Cluster groups with each variable of interest
# Point plot of each variable of interest (all but the last three entries) per cluster.
for var in spotify_var_interest[:-3]:
    sns.catplot(
        data=spotifyclean_df_copy,
        x="hclustgroup",
        y=var,
        hue="hclustgroup",
        kind="point",
        aspect=2,
    )
    plt.title(f"Point Plots of {var} by Cluster Groups")
    plt.show()
Pair Plots comparing the Cluster groups for all the variables
# Pairwise scatter plots of the variables of interest, coloured by cluster.
cluster_pair_cols = ["hclustgroup"] + spotify_var_interest[:-3]
sns.pairplot(
    data=spotifyclean_df_copy[cluster_pair_cols],
    hue="hclustgroup",
    diag_kws={"common_norm": False},
)
plt.show()
Trend Plots for Logistic Regressions stratified by categorical inputs
# Logistic trend of the outcome against each continuous input, with one
# fitted curve per level of the stratifying variable (no confidence bands).
for var2 in ["key", "mostcom_playlistgenre", "hclustgroup"]:
    for var in spotify_var_interest[1:-3]:
        sns.lmplot(
            data=spotifyclean_df_copy,
            x=var,
            y="binary_outcome",
            hue=var2,
            logistic=True,
            ci=None,
        )
        plt.title(f"Logistic Regression Trend Plots of {var} by {var2}")
        plt.show()
4 - Compare your cluster assignments to the outcome unique values
Heatmap Comparing Cluster Assignments with Binary Outcome
# Chi-square test of independence between cluster assignment and outcome.
# The table is built from spotifyclean_df_copy, the same frame the plots
# below use. The test previously used `df_clean`, a leftover from the last
# iteration of an earlier loop, filtered on an unrelated variable.
cluster_by_outcome = pd.crosstab(spotifyclean_df_copy["hclustgroup"], spotifyclean_df_copy["binary_outcome"])
chi2, p_value, dof, expected = stats.chi2_contingency(cluster_by_outcome)
print("Variable: Cluster Assignment, Chi-Square Value: %f, P-value: %f" % (chi2, p_value))

fig, ax = plt.subplots(1, 2, figsize=(14, 6))

# Left panel: contingency counts, including row/column totals.
sns.heatmap(
    data=pd.crosstab(spotifyclean_df_copy.binary_outcome, spotifyclean_df_copy.hclustgroup, margins=True),
    annot=True,
    annot_kws={"fontsize": 10},
    cmap="coolwarm",
    fmt="d",
    ax=ax[0],
)
ax[0].set_title("Heatmap of Cluster Assignments and Outcome", fontsize=12, fontweight="bold")

# Right panel: dodged bar chart of outcome counts within each cluster.
sns.countplot(data=spotifyclean_df_copy, x="hclustgroup", hue="binary_outcome", palette="coolwarm", dodge=True, ax=ax[1])
ax[1].set_title("Dodge Bar Chart of Cluster Assignments by Outcome", fontsize=12, fontweight="bold")

plt.tight_layout()
plt.show()
# Number of rows assigned to each cluster.
sns.catplot(
    data=spotifyclean_df_copy,
    x="hclustgroup",
    hue="hclustgroup",
    kind="count",
    palette="coolwarm",
    aspect=1.5,
)
plt.title("Bar Chart of Cluster Assignments", fontsize=12, fontweight="bold")
plt.show()
# Same information expressed as proportions.
spotifyclean_df_copy["hclustgroup"].value_counts(normalize=True)
After the Cluster Analysis, I want to keep the following as inputs:
- Continuous:
- Danceability
- Energy
- Acousticness
- Instrumentalness
- Duration (ms)
- Categorical
- Key
- Playlist genre
LASSO Regularization to determine predictive features
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split

var_list[1:]

# Features: spotify_var_interest without its first entry and last three entries.
x = spotifyclean_df_copy.loc[:, spotify_var_interest[1:-3]]
# Target: the binary outcome.
y = spotifyclean_df_copy["binary_outcome"]

# Hold out 20% of the rows; only the training split is used for the fit below.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2100)

# L1-penalised (LASSO) logistic regression with the penalty strength chosen by
# 50-fold cross-validation. 'saga' supports the L1 penalty but shuffles the
# data internally, so random_state is pinned (to the seed used elsewhere in
# this notebook) to make the selected features reproducible between runs.
lasso = LogisticRegressionCV(penalty="l1", solver="saga", cv=50, random_state=2100)
lasso.fit(X_train, y_train)

print("Feature coefficients:", lasso.coef_)

# Features with a non-zero coefficient survive the L1 penalty;
# ravel() flattens the (1, n_features) coefficient array for masking.
selected_features = X_train.columns[(lasso.coef_.ravel() != 0)].tolist()
print("Selected features:", selected_features)
4. Models: Fitting and Interpretation¶
import statsmodels.formula.api as smf
from sklearn.model_selection import StratifiedKFold
# Stratified 5-fold splitter passed as `cv` to the model-comparison loop below.
# Shuffling with a fixed seed makes the fold assignment reproducible.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2100)
# "binary_outcome ~ danceability_logit * energy_logit * acousticness_logit * instrumentalness_logit * duration_ms",
# Building blocks shared by the candidate formulas.
_continuous_terms = " + ".join(
    [
        "danceability_logit",
        "energy_logit",
        "acousticness_logit",
        "instrumentalness_logit",
        "duration_ms",
    ]
)
_quadratic_terms = " + ".join(
    f"poly({_term}, degree=2)" for _term in _continuous_terms.split(" + ")
)
_categorical_group = "(mostcom_playlistgenre + key)"

# Candidate models, ordered roughly from simplest to most complex. The position
# in this list is used as the model ID when the formulas are fit.
# NOTE(review): `poly` is not one of patsy's built-in formula functions; unless
# a `poly` helper is defined elsewhere in this notebook, the last formula cannot
# be evaluated and will be reported as "could NOT fit" -- confirm.
formula_list = [
    # 0: intercept only
    "binary_outcome ~ 1",
    # 1: categorical inputs, linear additive
    "binary_outcome ~ key + mostcom_playlistgenre",
    # 2: continuous inputs, linear additive
    f"binary_outcome ~ {_continuous_terms}",
    # 3: categorical and continuous inputs, linear additive
    f"binary_outcome ~ {_continuous_terms} + mostcom_playlistgenre + key",
    # 4: continuous inputs with all pairwise interactions
    f"binary_outcome ~ ({_continuous_terms})**2",
    # 5: continuous inputs interacted with the categorical inputs
    f"binary_outcome ~ ({_continuous_terms}) * {_categorical_group}",
    # 6: degree-2 terms of the continuous inputs interacted with the categorical inputs
    f"binary_outcome ~ ({_quadratic_terms}) * {_categorical_group}",
]
# Input column names: `spotify_var_interest` without its first and last entries.
# NOTE(review): the LASSO step earlier slices this list with [1:-3]; confirm the
# wider [1:-1] slice is intended here.
input_names = spotify_var_interest[1:-1]
input_names
# Name of the binary response column; it is the left-hand side of every formula.
output_name = "binary_outcome"
output_name
The function below (adapted from class) fits a logistic regression formula within each cross-validation fold and returns the training and testing accuracy for every fold.
def train_and_test_logistic_with_cv(mod_name, a_formula, data_df, x_names, y_name, cv, threshold=0.5):
    """Cross-validate one logistic-regression formula and report per-fold accuracy.

    Parameters
    ----------
    mod_name : label stored in the ``model_name`` column of the result.
    a_formula : formula string handed to ``smf.logit``.
    data_df : DataFrame holding every column the formula and ``y_name`` refer to.
    x_names : columns of ``data_df`` passed to ``cv.split`` as the input array.
        The formula, not this list, decides which terms enter the model.
    y_name : name of the response column; also passed to ``cv.split``.
    cv : splitter object exposing ``split(X, y)``.
    threshold : predicted probabilities strictly above this value are
        classified as 1, everything else as 0.

    Returns
    -------
    DataFrame with one row per fold and split ("training" rows first, then
    "testing"), with columns ``Accuracy``, ``from_set``, ``fold_id``,
    ``model_name``, ``model_formula``, ``num_coefs`` and ``threshold``.
    ``num_coefs`` is taken from the model fit in the last fold.
    """

    def _accuracy_on(model, frame):
        # Score `frame`, classify at `threshold`, return the fraction classified correctly.
        scored = frame.copy()
        scored["pred_probability"] = model.predict(frame)
        scored["pred_class"] = np.where(scored.pred_probability > threshold, 1, 0)
        return np.mean(scored[y_name] == scored.pred_class)

    def _as_table(scores, set_label):
        # One row per fold, with folds numbered from 1.
        table = pd.DataFrame({"Accuracy": scores})
        table["from_set"] = set_label
        table["fold_id"] = table.index + 1
        return table

    # Arrays used only to generate the fold indices.
    input_df = data_df.loc[:, x_names].copy()
    response = data_df[y_name].to_numpy()

    train_scores = []
    test_scores = []
    for train_rows, test_rows in cv.split(input_df.to_numpy(), response):
        fold_train = data_df.iloc[train_rows, :].copy()
        fold_test = data_df.iloc[test_rows, :].copy()

        # Fit on this fold's training rows only.
        a_mod = smf.logit(formula=a_formula, data=fold_train).fit()

        # Accuracy on the rows the model saw, then on the rows it did not.
        train_scores.append(_accuracy_on(a_mod, fold_train))
        test_scores.append(_accuracy_on(a_mod, fold_test))

    # Stack the training results on top of the testing results.
    res_df = pd.concat(
        [_as_table(train_scores, "training"), _as_table(test_scores, "testing")],
        ignore_index=True,
    )

    # Attach the model-level information to every row.
    res_df["model_name"] = mod_name
    res_df["model_formula"] = a_formula
    res_df["num_coefs"] = len(a_mod.params)
    res_df["threshold"] = threshold
    return res_df
# Cross-validate every candidate formula; the position in `formula_list` is the model ID.
results_list = []
for m, formula in enumerate(formula_list):
    print(f"-- Formula ID {m} --")
    try:
        fold_results = train_and_test_logistic_with_cv(
            mod_name=m,
            a_formula=formula,
            data_df=spotifyclean_df_copy,
            x_names=input_names,
            y_name=output_name,
            cv=kf,
            threshold=0.5,
        )
    except Exception as err:
        # Skip a formula that cannot be fit, but say why. Catching `Exception`
        # rather than using a bare `except` lets Ctrl-C / kernel interrupts
        # still stop a long-running fit.
        print(f"!!! Formula ID {m} could NOT fit !!!")
        print(f"    reason: {type(err).__name__}: {err}")
    else:
        results_list.append(fold_results)

# One long table: a row per model, fold and split. Formulas that failed are absent.
cv_results = pd.concat(results_list, ignore_index=True)
cv_results
Plot the average testing-set accuracy across folds for each model, with 68% confidence-interval error bars.
# Only the held-out (testing) rows are plotted.
_is_testing = cv_results.from_set == "testing"

# One point per model, with a 68% confidence-interval error bar; the points
# are not connected by a line.
sns.catplot(
    data=cv_results.loc[_is_testing, :],
    x="model_name",
    y="Accuracy",
    hue="model_name",
    kind="point",
    linestyle="none",
    palette="coolwarm",
    errorbar=("ci", 68),
)
plt.show()